### Prepare variables:

# Here I create variables that will be used in the analysis
# Load the final analysis sample and convert it to a data.table.
data <- as.data.table(read.csv("intermediary_outputs/final_sample.csv"))

## Squared skin color term (currently disabled):
# data$skin_color_sq <- data$skin_color^2

# Indicator variables, added by reference in one step:
#   catholic / evangelic - 1 when religion_simple equals 1 / 2, 0 otherwise
#                          (NA when religion_simple is missing)
#   interview_dummy      - 1 when id_interview is present, 0 when it is NA
#   grades_dummy         - 1 when missing_wo_edfis equals 0, 0 otherwise
#                          (a missing missing_wo_edfis also yields 0)
data[, `:=`(
  catholic = as.numeric(religion_simple == 1),
  evangelic = as.numeric(religion_simple == 2),
  interview_dummy = as.numeric(!is.na(id_interview)),
  grades_dummy = as.numeric(missing_wo_edfis %in% 0)
)]

## Whiten, darken & racial mismatch
# Both flags are 0/1 and are set to NA when race or skin_color is missing or
# when race is greater than 3.
# NOTE(review): the original notes on these rules read as race 2 = black and
# race 3 = brown, but the branco/pardo/preto dummies created later in this
# script code race 2 as pardo and race 3 as preto - confirm which coding is
# the right one.

# whiten = 1 for:
#   race 1 with skin_color above 6  (original note: browns and blacks that
#                                    classified as white)
#   race 3 with skin_color above 10 (original note: blacks that classified
#                                    as brown)
data[, whiten := {
  unusable <- is.na(race) | is.na(skin_color) | race > 3
  flag <- as.numeric(
    (race == 1 & skin_color > 6) | (race == 3 & skin_color > 10)
  )
  flag[unusable] <- NA
  flag
}]

# darken = 1 for:
#   race 2 with skin_color at most 10 (original note: browns and whites that
#                                      classified as black)
#   race 3 with skin_color at most 6  (original note: whites that classified
#                                      as brown)
data[, darken := {
  unusable <- is.na(race) | is.na(skin_color) | race > 3
  flag <- as.numeric(
    (race == 2 & skin_color <= 10) | (race == 3 & skin_color <= 6)
  )
  flag[unusable] <- NA
  flag
}]

# race_mismatch: 0 = neither flag set, 1 = whiten, 2 = darken (darken is
# applied last, so it wins if both were ever set).
# NOTE(review): rows where whiten/darken are NA end up as 0 here rather than
# NA - confirm that this is intended.
data[, race_mismatch := ifelse(
  darken %in% 1, 2,
  ifelse(whiten %in% 1, 1, 0)
)]

## Skin groups
# 0: skin_color <= 6, 1: 6 < skin_color <= 9, 2: skin_color > 9.
# A missing skin_color stays NA.
# NOTE(review): the upper cut-off here is 9, while the whiten/darken rules
# earlier in this script split at 10 - confirm the difference is intended.
data$skin_group <- as.numeric(
  findInterval(data$skin_color, c(6, 9), left.open = TRUE)
)

## Dark against light skin color:
# Groups 1 and 2 are merged into a single category 1; group 0 is unchanged.
data$skin_group_2 <- pmin(data$skin_group, 1)
table(data$skin_color, data$skin_group_2)

## Aggregating occupations into fewer categories (father and mother)

# Builds the four aggregated occupation variants for one parent.
#   egp_code: EGP occupation code (1-11); anything else maps to NA
#   no_work:  flag equal to 1 when the parent does not work
# Returns a list with:
#   simple          - aggregated class 1-7 (NA when the code is not 1-11)
#   no_job          - simple, with 8 wherever no_work == 1
#   no_na           - simple, with NA replaced by 8
#   no_job_no_na    - no_job, with the remaining NA replaced by 9
# Note that code 8 means "no job" in no_job but "missing" in no_na.
simplify_occupation <- function(egp_code, no_work) {
  # Position in this vector = EGP code, value = aggregated class.
  egp_to_simple <- c(
    1, #  1 Higher controllers
    1, #  2 Lower controllers
    2, #  3 Routine nonmanual
    2, #  4 Lower sales-services
    3, #  5 Self-employed with employees
    3, #  6 Self-employed with no employees
    4, #  7 Manual supervisor
    5, #  8 Skilled workers
    6, #  9 Unskilled workers
    7, # 10 Farm labor
    3  # 11 Self-employed farmer
  )
  simple <- egp_to_simple[match(egp_code, seq_along(egp_to_simple))]

  no_job <- simple
  no_job[which(no_work == 1)] <- 8

  no_na <- simple
  no_na[is.na(simple)] <- 8

  no_job_no_na <- no_job
  no_job_no_na[is.na(no_job)] <- 9

  list(
    simple = simple,
    no_job = no_job,
    no_na = no_na,
    no_job_no_na = no_job_no_na
  )
}

## Father
occ_father <- simplify_occupation(data$code_egp_father, data$no_work_father)
data$occ_father_simple <- occ_father$simple
data$occ_father_simple_no_job <- occ_father$no_job
data$occ_father_simple_no_na <- occ_father$no_na
data$occ_father_simple_no_job_no_na <- occ_father$no_job_no_na

## Mother
occ_mother <- simplify_occupation(data$code_egp_mother, data$no_work_mother)
data$occ_mother_simple <- occ_mother$simple
data$occ_mother_simple_no_job <- occ_mother$no_job
data$occ_mother_simple_no_na <- occ_mother$no_na
data$occ_mother_simple_no_job_no_na <- occ_mother$no_job_no_na

# Race indicator dummies (NA when race is missing): branco = race 1,
# pardo = race 2, preto = race 3. They feed the white / brown / black class
# shares computed below.
# NOTE(review): the whiten/darken rules earlier in this script read as
# race 2 = black and race 3 = brown, the opposite of the coding used here -
# confirm which one is correct.
data[, `:=`(
  branco = as.numeric(race == 1),
  pardo = as.numeric(race == 2),
  preto = as.numeric(race == 3)
)]

## Classroom characteristics:
# One row per class_id: the class size (unnamed n() column), followed by
# within-class means that ignore missing values. Column order matters: the
# code that copies these aggregates back onto `data` reads them by position.
# The four *_grades_score columns are NA placeholders.
# NOTE(review): "missing_interview" is the mean of interview_dummy, i.e. the
# share of students that DO have an interview id - confirm the name.
mean_present <- function(x) mean(x, na.rm = TRUE)

avg_class_races <- summarise(
  group_by(data, class_id),
  n(),
  white = mean_present(branco),
  brown = mean_present(pardo),
  black = mean_present(preto),
  skin = mean_present(skin_color),
  male = mean_present(male),
  male_imp = mean_present(male_imputed),
  missing_interview = mean_present(interview_dummy),
  mean_grades_score = NA,
  max_grades_score = NA,
  min_grades_score = NA,
  median_grades_score = NA,
  avg_skin_class = mean_present(skin_color),
  avg_score_racism_class = mean_present(score_racism),
  avg_score_parents_support_class = mean_present(score_parents_support),
  avg_score_study_class = mean_present(score_study),
  avg_score_self_esteem_class = mean_present(score_self_esteem),
  avg_scores_poverty_class = mean_present(scores_poverty),
  avg_scores_sdo_1_class = mean_present(scores_sdo_1),
  avg_scores_sdo_2_class = mean_present(scores_sdo_2),
  avg_score_homophobia_class = mean_present(score_homophobia),
  avg_dominance_score_class = mean_present(dominance_score),
  avg_anti_equality_score_class = mean_present(anti_equality_score),
  avg_score_climate_school_class = mean_present(score_climate_school),
  avg_score_violence_school_class = mean_present(score_violence_school),
  avg_score_neighborhood_quality_class =
    mean_present(score_neighborhood_quality)
)

## Attach the classroom aggregates to every student row.
# Each student is linked to its row of avg_class_races through the class_id
# VALUE. The previous loop compared class_id with the row number of
# avg_class_races, which is only correct when the class ids are exactly
# 1, 2, ..., K; any other id scheme silently produced NA or the aggregates of
# a different class. Students with a missing class_id keep NA in every
# class-level column.
class_row <- match(data$class_id, avg_class_races$class_id)
class_row[is.na(data$class_id)] <- NA_integer_

# Returns one aggregate column of avg_class_races, expanded to one value per
# row of `data`.
class_value <- function(column) {
  as.numeric(avg_class_races[[column]])[class_row]
}

# Class size comes from the unnamed n() column of the summary.
data$class_size <- class_value("n()")
data$sh_white_class <- class_value("white")
data$sh_brown_class <- class_value("brown")
data$sh_black_class <- class_value("black")
# Non-white share = brown share + black share.
data$sh_nw_class <- class_value("brown") + class_value("black")
data$skin_color_class <- class_value("skin")
data$sh_males_class <- class_value("male")
data$sh_males_imputed_class <- class_value("male_imp")
# NOTE(review): this is the class mean of interview_dummy, i.e. the share of
# students that have an interview id, despite the "missing" in the name.
data$sh_missing_interview_class <- class_value("missing_interview")

# The grade aggregates are NA placeholders in avg_class_races.
data$avg_grades_score_class <- class_value("mean_grades_score")
data$max_grades_score_class <- class_value("max_grades_score")
data$min_grades_score_class <- class_value("min_grades_score")
data$median_grades_score_class <- class_value("median_grades_score")

# These aggregates carry the same name in avg_class_races and in `data`.
same_name_columns <- c(
  "avg_skin_class",
  "avg_score_racism_class",
  "avg_score_parents_support_class",
  "avg_score_study_class",
  "avg_score_self_esteem_class",
  "avg_scores_poverty_class",
  "avg_scores_sdo_1_class",
  "avg_scores_sdo_2_class",
  "avg_score_homophobia_class",
  "avg_dominance_score_class",
  "avg_anti_equality_score_class",
  "avg_score_climate_school_class",
  "avg_score_violence_school_class"
)
for (column in same_name_columns) {
  data[, (column) := class_value(column)]
}

# NOTE(review): avg_class_races has no matching aggregate for this column, so
# it is created and left as NA, exactly as before - confirm whether an
# overall-violence score should be summarised as well.
data$avg_score_overall_violence_school_class <- NA

data$avg_score_neighborhood_quality_class <-
  class_value("avg_score_neighborhood_quality_class")

saveRDS(data, "intermediary_outputs/data_out_3.Rds")
